cd "C:\Users\jedwab\Desktop\World Settlement Footprint 3D 05092023\Stata"

***************************
***************************
*** EMPORIS INFORMATION ***
***************************
***************************

* The data for this version was downloaded on 09-07-2022

*******************************
*** COUNTRY POPULATION DATA ***
*******************************

* World Urbanization Prospects 2019 accessed 04302022
clear
import excel "pop_wup_04302022.xlsx", sheet("Sheet1") firstrow clear
ren country country_wup
gen countryname = country_wup
replace countryname = "Bolivia" if countryname == "Bolivia (Plurinational State of)"
replace countryname = "Brunei" if countryname == "Brunei Darussalam"
replace countryname = "Cape Verde" if countryname == "Cabo Verde"
replace countryname = "CuraÃ§ao" if countryname == "Curaçao"
replace countryname = "Czech Republic" if countryname == "Czechia"
replace countryname = "CÃ´te d'Ivoire" if countryname == "Côte d'Ivoire"
replace countryname = "Iran" if countryname == "Iran (Islamic Republic of)"
replace countryname = "Laos" if countryname == "Lao People's Democratic Republic"
replace countryname = "Macedonia" if countryname == "TFYR Macedonia"
replace countryname = "Moldova" if countryname == "Republic of Moldova"
replace countryname = "North Korea" if countryname == "Dem. People's Republic of Korea"
replace countryname = "Palestina" if countryname == "State of Palestine"
replace countryname = "Republic of Congo" if countryname == "Congo"
replace countryname = "Russia" if countryname == "Russian Federation"
replace countryname = "South Korea" if countryname == "Republic of Korea"
replace countryname = "Syria" if countryname == "Syrian Arab Republic"
replace countryname = "SÃ£o TomÃ© and PrÃ­ncipe" if countryname == "Sao Tome and Principe"
replace countryname = "Taiwan" if countryname == "China, Taiwan Province of China"
replace countryname = "Tanzania" if countryname == "United Republic of Tanzania"
replace countryname = "United States" if countryname == "United States of America"
replace countryname = "Venezuela" if countryname == "Venezuela (Bolivarian Republic of)"
replace countryname = "Vietnam" if countryname == "Viet Nam"
replace countryname = "Hong Kong" if countryname == "China, Hong Kong SAR"
replace countryname = "Antigua & Barbuda" if countryname == "Antigua and Barbuda"
replace countryname = "Bosnia-Herzegovina" if countryname == "Bosnia and Herzegovina"
replace countryname = "Central African Rep." if countryname == "Central African Republic"
replace countryname = "Democr. Rep. Congo" if countryname == "Democratic Republic of the Congo"
replace countryname = "Ivory Coast" if countryname == "CÃ´te d'Ivoire"
replace countryname = "Republic of the Congo" if countryname == "Republic of Congo"
replace countryname = "SÃ£o TomÃ© and Principe" if countryname == "SÃ£o TomÃ© and PrÃ­ncipe"
replace countryname = "Trinidad & Tobago" if countryname == "Trinidad and Tobago"
replace countryname = "U.S. Virgin Islands" if countryname == "United States Virgin Islands"
replace countryname = "U.S.A." if countryname == "United States"
replace countryname = "Falkland Islands" if countryname == "Falkland Islands (Malvinas)"
replace countryname = "Vatican City" if countryname == "Holy See"
replace countryname = "Palestinian Territories" if countryname == "Palestina"
replace countryname = "Sint Maarten" if countryname == "Sint Maarten (Dutch part)"
replace countryname = "Saint Vincent" if countryname == "Saint Vincent and the Grenadines"
replace countryname = "Saint Helena, Ascension and Tristan da Cunha" if countryname == "Saint Helena"
keep countryname y2020
ren y2020 totpop2020
sort countryname
save totpopwup, replace

* We verify that it merges well with our data
use country_list, clear
sort countryname
merge countryname using totpopwup
tab _m
tab countryname if _m == 1
tab countryname if _m == 2
drop if _m == 2
drop _m

**********************************
*** EMPORIS BUILDINGS DATABASE ***
**********************************

* Version from 09-08-2022
clear
import delimited "Emporis 09072022 - Sep 8, 2022.csv", delimiter(";") 
count
* 823,061 buildings 
desc 
sum heightestimated, d

***** WE CORRECT A FEW COUNTRY/CITY NAMES ***** 

replace cityname = "Tel Aviv - Yaffo" if countryname == "2019"
replace countryname = "Israel" if countryname == "2019"
replace countryname = "Turkey" if countryname == "Marmara Region"
replace countryname = "U.S.A." if countryname == "Navassa Island"

***** WE ADD THE COUNTRY'S TOTAL POPULATION *****

sort countryname
merge countryname using totpopwup
tab _m
tab countryname if _m == 1
drop if _m == 2
drop _m
replace totpop2020 = subinstr(totpop2020, " ", "",.)
destring totpop2020, replace 
* Population in 000s
sum totpop2020 if countryname == "China"
replace totpop2020 = 20 if countryname == "Bonaire"
replace totpop2020 = 64 if countryname == "Guernsey"
replace totpop2020 = 99 if countryname == "Jersey"
replace totpop2020 = 40 if countryname == "Saint Martin"

***** WE SELECT THE EXISTING/DEMOLISHED VARIABLES AND MODIFY THE CURRENT STATUS VARIABLES *****

tab currentstatus, m
split currentstatus, parse(" [") limit(1) gen(currentstatus_simple)
ren currentstatus status
ren currentstatus_simple statussimple
tab statussimple, m
gen existdemo = (statussimple == "existing" | statussimple == "demolished")
keep if existdemo == 1
count
* 781,136

*** DROP IF LONGITUDE MISSING OR CITY/METRO/COUNTRY NAME ***

drop if longitude == . | latitude == .
count
* 764,343

***** CONSTRUCTION TYPE *****

tab constructiont
* We keep all

***** HEIGHT *****

drop if heighttip == "" & heightstructu == "" & heightestimated == . & heightroof == .& heightmainroof == .& heighttopfloor == . & heightobsdeck == .& floorsoverground == "" 
count
* 748,651

*** RULE TO CREATE HEIGHT ***
codebook heightestimated heightstructu

** HEIGHT STRUCTURAL **
foreach X in heightstructu  {
desc `X', f
gen byte notnumeric = real(`X')==. /*makes indicator for obs w/o numeric values*/
tab notnumeric /*==1 where nonnumeric characters*/
tab `X' if notnumeric==1, m /*will show which have nonnumeric*/
}
replace heightstructu = "" if heightstructu == "Asia"
drop notnumeric
destring heightstructu, replace 
*codebook heightstructu
count if heightstructu != .
sum heightstructu, d

** HEIGHT ESTIMATED **
sum heightestimated, d
* this variable is clean
*corr heightstructu heightestimated
count if heightestimated != .

** MAIN HEIGHT VARIABLE IN EMPORIS **
* Structural/architectural (incl estimated)
gen height = heightstructu
replace height = heightestimated if height == .
codebook height, d
sum height, d
count if height == .

* BUT WE USE ROOF/TOP FLOOR *
** OTHER HEIGHT VARIABLES (by fit) **
* heighttopfloor
* heightmainroof 
* floors overground
* tip

* WE FIRST USE HEIGHTROOF *
sum heightroof, d
replace heightroof = . if heightroof > 4000
count if heightroof == .
count if height == .
count if height == . & heightroof != .
corr height heightroof
* 0.993 (N = 23,558)
corr heightstructu heightroof
* 0.993 (N = 22,396)

codebook countryname*
codebook cityname* 
* 16,102  in 214 countries

gen countrycity = countryname+""+cityname

*reg heightstructu heightroof, robust
*areg heightstructu heightroof, robust absorb(countryname)
*areg heightstructu heightroof, robust absorb(countrycity)

*reg height heightroof, robust
*areg height heightroof, robust absorb(countryname)
*areg height heightroof, robust absorb(countrycity)
* R2 = 0.99

* What we had before
*label var height "Height (m) - architectural (incl. est.)"
*gen height2 = height
*label var height2 "Height2 (m) - architectural (incl. est.) + others"
*areg height heightroof, robust absorb(countrycity)
*replace height2 = _b[_cons] + _b[heightroof]*heightroof if height == .
*count if height2 == .
* still missing 8,713
*drop height2

** STRUCTURAL HEIGHT **

ren height heightstructraw
label var heightstructraw "Structural height (m) (incl. estimated)"

** ROOF HEIGHT, INCLUDING INFORMATION ON STRUCTURAL HEIGHT **

codebook heightroof
* missing for most: 724,655/748,651
areg heightroof heightstructraw, robust absorb(countrycity)
gen heightroof2 = heightroof
replace heightroof2 = _b[_cons] + _b[heightstructraw]*heightstructraw if heightroof == .
count if heightroof2 == .
* still missing 8,713
gen heightroofraw = heightroof2 
label var heightroofraw "Roof height (m) - based on roofheight or structural (incl. estimated)"

sum heightstructraw heightroofraw

** WE THEN USE TOP OCCUPIED HEIGHT **

sum heightroofraw heighttopfloor, d
corr heightroofraw heighttopfloor

count if heightroof2 == .
* 8713
areg heightroofraw heighttopfloor, robust absorb(countrycity)
replace heightroof2 = _b[_cons] + _b[heighttopfloor]*heighttopfloor if heightroof2 == .
count if heightroof2 == .
* 8703

** NEXT, WE USE HEIGHT OF MAIN ROOF **

count if heightroof2 == . & heightmainroof != .
* adds 906
count if heightroof2 == .
* 8703
corr heightroofraw heightmainroof
* We don't use due to the very low correlation
* 0.05

** WE THEN USE TIP **

foreach X in heighttip {
desc `X', f
gen byte notnumeric = real(`X')==. /*makes indicator for obs w/o numeric values*/
tab notnumeric /*==1 where nonnumeric characters*/
tab `X' if notnumeric==1, m /*will show which have nonnumeric*/
}
replace heighttip = "" if heighttip == "Turkey"
drop notnumeric
destring heighttip, replace 
sum heighttip, d
* Issues with two buildings
replace heighttip = . if heighttip > 4000

count if heightroof2 == .
* 8703
count if heightroof2 == . & heighttip != .
* 189
corr heightroofraw heighttip
* 0.99

count if heightroof2 == .
* 8,703
areg heightroofraw heighttip, robust absorb(countrycity)
replace heightroof2 = _b[_cons] + _b[heighttip]*heighttip if heightroof2 == .
count if heightroof2 == .
* 8514 still missing
* difference 

** FINALLY, WE USE THE NUMBERS OF FLOORS OVERGROUND **

count if heightroof2 == .
* 8,514

* Relationship between height and floors
foreach X in floorsovergro  {
desc `X', f
gen byte notnumeric = real(`X')==. /*makes indicator for obs w/o numeric values*/
tab notnumeric /*==1 where nonnumeric characters*/
tab `X' if notnumeric==1, m /*will show which have nonnumeric*/
}
drop notnumeric
destring floorsovergro, replace 
sum floorsoverground, d
* ok

*areg heightroofraw floorsovergro, robust absorb(countrycity)
* Distribution number of floors for these 8503 below
sum floorsovergro if heightroof2 == ., d
*areg heightroofraw floorsovergro if floorsovergro <= 53, robust absorb(countrycity)
*areg heightroofraw floorsovergro if floorsovergro <= 31, robust absorb(countrycity)

corr heightroofraw floorsovergro

count if heightroof2 == .
* 8514 still missing
areg heightroofraw floorsovergro, robust absorb(countrycity)
replace heightroof2 = _b[_cons] + _b[floorsovergro]*floorsovergro if heightroof2 == .
count if heightroof2 == .
* 11 still missing, so 8514 - 11 = 8503
drop if heightroof2 == .
count
* 748,640

* We predict the number of floors when missing
count if floorsovergro != .
* 728,193
count if floorsovergro == .
* 20,447

sum heightroof2, d
* MAIN HEIGHT VARIABLE *
gen height = heightroof2

gen numfloors = floorsovergro
areg floorsovergro heightroofraw, robust absorb(countrycity)
gen cons = _b[_cons]
gen coefheight = _b[heightroofraw]
sum cons coefheight height
replace numfloors = cons + coefheight*height if numfloors == .
codebook numfloors

corr heightroofraw floorsovergro

sum heightroofraw, d

* WEB APPX FIG 
*twoway (scatter heightroofraw floorsovergro if heightroofraw != . & floorsovergro != ., mcolor(gs5) msize(tiny))(qfit heightroofraw floorsovergro if heightroofraw != . & floorsovergro != ., lcolor(gs8) lwidth(thick) lpattern(dash))(lfit heightroofraw floorsovergro if heightroofraw != . & floorsovergro != ., lcolor(black) lwidth(thick)), ytitle(Roof Height (meters), margin(medsmall)) xtitle(Number of Floors, margin(small)) xlabel(0(25)150) ylabel(0(100)700) legend(order(3 "Linear Fit" 2 "Quadratic Fit")) graphregion(fcolor(white) lcolor(white) ifcolor(white) ilcolor(white) margin(vsmall))
*graph export height_floors.png, replace width(2620) height(1908)

* Relationship for different segments
areg heightroofraw floorsovergro if heightroofraw < 35, robust absorb(countrycity)
areg heightroofraw floorsovergro if heightroofraw >= 35 & heightroofraw < 100, robust absorb(countrycity)
areg heightroofraw floorsovergro if heightroofraw > 100, robust absorb(countrycity)

count if heightroof2 == .
count
* 748,640

count if buildingcosts != . & height >= 55
* 2,633
* but FID may not be available for all

***** YEAR VARIABLES *****

count
* 748,640
count if yearconstructionend != .
* 539,455
count if yearconstructionend == . & (yearconstructionstart != . | yearlastrenovation != .) 
* 22,868
count if yearconstructionstart == .
* 631,826
count if yearconstructionend == .
* 209,185
count if yearconstructionstart == . & yearconstructionend == .
* 187,886
tab statussimple, m

* WE NOW KEEP THE ONES WITH MISSING YEAR INFO
* TO CREATE A "MODIFIED 2012" AND A "MODIFIED 2019"
* We drop the ones with no year information
*drop if statussimple == "existing" & (yearconstructionstart == . & yearconstructionend == . & yearlastrenov == .)
* deleted 182074
*count
* 566,566

*keep ebn countryname cityname metroareaname latitude longitude status statussimple height year* destruction
*save tallbuild09122022, replace

* We check the variables make sense *

* First, demolition year
tab status
replace destruction = . if statussimple == "existing"
replace destruction = 2022 if status == "demolished [under demolition]"
*tab destruction if status == "demolished [destroyed]", m
* OK, they are all before 2022 
* If date is missing, we assume demolition was before 2012/2019, otherwise it would have been in the data set

* checking each year variable
*tab yearconstructionstart, m
*tab yearconstructionend, m
*tab yearlastrenovation, m

* we check that the year started <= the year ended
count if (yearconstructionend < yearconstructionstart) & yearconstructionstart != . & yearconstructionend != . & statussimple == "existing"
* 189 with construction end before construction starts
gen diff = yearconstructionstart-yearconstructionend if (yearconstructionend < yearconstructionstart) & yearconstructionstart != . & yearconstructionend != . & statussimple == "existing"
sum diff, d
* Difference more than 1 in many cases
drop diff
*keep if diff != .
*order yearconstructionstart yearconstructionend 
*gsort- height
*order height
* They tend to be smaller buildings, so should be OK
* For all of these, better to invert the years

gen end = yearconstructionend
gen start = yearconstructionstart
replace yearconstructionstart = end if (yearconstructionend < yearconstructionstart) & yearconstructionstart != . & yearconstructionend != . & statussimple == "existing"
replace yearconstructionend = start if (yearconstructionend < yearconstructionstart) & yearconstructionstart != . & yearconstructionend != . & statussimple == "existing"
drop start end 
count if (yearconstructionend < yearconstructionstart) & yearconstructionstart != . & yearconstructionend != . & statussimple == "existing"
* 0 now

* For non-existing buildings
count if (yearconstructionend < yearconstructionstart) & yearconstructionstart != . & yearconstructionend != . & statussimple != "existing"
* 1
* just one building
gen end = yearconstructionend
gen start = yearconstructionstart
replace yearconstructionstart = end if (yearconstructionend < yearconstructionstart) & yearconstructionstart != . & yearconstructionend != . & statussimple != "existing"
replace yearconstructionend = start if (yearconstructionend < yearconstructionstart) & yearconstructionstart != . & yearconstructionend != . & statussimple != "existing"
drop start end 
count if (yearconstructionend < yearconstructionstart) & yearconstructionstart != . & yearconstructionend != . & statussimple == "existing"
* 0 now

* What about for the renovation year
count if (yearlastrenovation < yearconstructionstart) & yearconstructionstart != . & yearlastrenovation != . & statussimple == "existing"
* 3 with renovation before construction starts
gen diff = yearconstructionstart-yearlastrenovation if (yearlastrenovation < yearconstructionstart) & yearconstructionstart != . & yearlastrenovation != . & statussimple == "existing"
sum diff, d
* Difference 1-2 in these cases, and only 3 buildings anyway
drop diff
* For these, we will rely on the construction start (or end if available)
* Hence, we only construct the year of renovation when the start year or end year is unavailable

** Analysis of construction times **

count
* 748,640
count if yearconstructionstart != .
* 116,814
count if yearconstructionend != .
* 539,455
count if yearconstructionstart != . & yearconstructionend == .
* 21,299

*areg yearconstructionend yearconstructionstart, robust absorb(countrycity)

* Time of construction
gen consttime = yearconstructionend-yearconstructionstart if yearconstructionend != . & yearconstructionstart != .  
sum consttime, d
* median 2; mean 3.1
sum consttime [w=height], d
* median 2; mean 3.6

gen heightcat = .
foreach X of numlist 0(50)300 {
replace heightcat = `X' if height > (`X'-50) & height <= `X'
}
replace heightcat = 350 if height > 300
tab heightcat, m
areg consttime i.heightcat, robust absorb(countrycity)
* max = 4 years
* we use 4 years
drop heightcat

sum yearconstructionstart if yearconstructionstart != . & yearconstructionend == ., d
sum height if yearconstructionstart != . & yearconstructionend == ., d

count if yearconstructionstart == 2008 & yearconstructionend == .
count if yearconstructionstart == 2009 & yearconstructionend == .

* Post-2000 due to new tech *
sum consttime if yearconstructionstart >= 2000, d
* median 2; mean 2.2; p75 = 3
sum consttime [w=height] if yearconstructionstart >= 2000, d
* median 2; mean 2.6; p75 = 3 

count if yearconstructionstart == . & yearconstructionend == . & yearlastrenovation != .
* 1569

* Year renovation now
gen diffrenov = yearlastrenovation - yearconstructionend
sum diffrenov, d
count if diffrenov <= 1
drop diffrenov

* in that case, the renovation is during the completion
* we replace by the competion date
replace yearlastrenovation = yearconstructionend if yearconstructionend != . & yearlastrenovation != . & yearlastrenovation < yearconstructionend
gen diffrenov = yearlastrenovation - yearconstructionend
sum diffrenov if diffrenov >= 1, d
count if diffrenov <= 1
sum yearlastrenov if yearconstructionstart == . & yearconstructionend == . & yearlastrenovation != ., d
* Hence, very likely existed if we have information on year of last renovation

* year of destruction relative to year of opening *
drop diff
gen diff = destruction - yearconstructionend
sum diff, d

* existing in ruins
* which year information we have? 
*codebook *year* if status == "existing [in ruins]"
count if status == "existing [in ruins]"
sum yearconstructionend if status == "existing [in ruins]", d
* if in ruins, assume most of the structure still there and height based on height of remaining structure

* Demolished with missing year destruction information
count if (status == "demolished [destroyed]" & destruction == .)
* 4,865
count if (status == "demolished [destroyed]" & destruction == . & (yearconstructionend != . | yearconstructionstart != . | yearlastrenovation != .))
* 3,010
count if (status == "demolished [destroyed]" & destruction == . & ((yearconstructionend != . & yearconstructionend > 2012) | (yearconstructionend == . & yearconstructionstart != . & yearconstructionstart > 2009)))
* 6
count if (status == "demolished [destroyed]" & destruction == . & ((yearconstructionend != . & yearconstructionend <= 2012) | yearconstructionstart != . | yearlastrenovation != .))
* 3,008

sum yearconstructionend if (status == "demolished [destroyed]" & destruction == . & (yearconstructionend != . & yearconstructionend <= 2012)), d
* 2960
sum diff, d
* Since median = 47 and mean = 53, we use 50 years
* Hence, for 2012, we keep if above 1962

tab yearconstructionend if (status == "demolished [destroyed]" & destruction == . & (yearconstructionend != . & yearconstructionend <= 2012))

** Selection 2012 **

*              demolished [destroyed] - OK (uses info on year demol)
*       demolished [under demolition] - Assume there 2012 / 2019
* -----
*                existing [completed] - OK (uses info on end, then start, then last renovation)
*                 existing [in ruins] - OK (uses info on end, then start, then last renovation)
*         existing [under renovation] - Assume there 2012 / 2019

***** FINAL SELECTION 2012 *****

*** We keep "existing buildings" or "existing buildings [in ruins]" ***
* completed before 2012
* OR construction started before 2009
* OR information on year of renovation, which suggests existed in 2012 
* OR under renovation as of 2022
gen sel2012_exist = 0
replace sel2012_exist = 1 if sel2012_exist == 0 & status == "existing [under renovation]"
replace sel2012_exist = 1 if sel2012_exist == 0 & (status == "existing [completed]" | status == "existing [in ruins]") & (yearconstructionend != . & yearconstructionend <= 2012)
replace sel2012_exist = 1 if sel2012_exist == 0 & (status == "existing [completed]" | status == "existing [in ruins]") & ((yearconstructionstart != . & yearconstructionstart <= 2009) | yearlastrenovation != .)
tab status sel2012_exist, m
 
*** We keep "demolished buildings" ***
* under demolition, which suggests existed in 2012
* demolished after 2012, which suggests existed in 2012
gen sel2012_demol = 0
replace sel2012_demol = 1 if sel2012_demol == 0 & (status == "demolished [under demolition]")
replace sel2012_demol = 1 if sel2012_demol == 0 & (status == "demolished [destroyed]" & destruction != . & destruction > 2012)
tab status sel2012_demol, m

gen sel2012 = (sel2012_exist == 1 | sel2012_demol == 1)
tab status sel2012, m

***** FINAL SELECTION 2019 *****

*** We keep "existing buildings" or "existing buildings [in ruins]" ***
* completed before 2019
* OR construction started before 2016
* OR information on year of renovation, which suggests existed in 2019
* OR under renovation as of 2022
gen sel2019_exist = 0
replace sel2019_exist = 1 if sel2019_exist == 0 & status == "existing [under renovation]"
replace sel2019_exist = 1 if sel2019_exist == 0 & (status == "existing [completed]" | status == "existing [in ruins]") & (yearconstructionend != . & yearconstructionend <= 2019)
replace sel2019_exist = 1 if sel2019_exist == 0 & (status == "existing [completed]" | status == "existing [in ruins]") & ((yearconstructionstart != . & yearconstructionstart <= 2016) | yearlastrenovation != .)
tab status sel2019_exist, m
 
*** We keep "demolished buildings" ***
* under demolition, which suggests existed in 2019
* demolished after 2012, which suggests existed in 2019
gen sel2019_demol = 0
replace sel2019_demol = 1 if sel2019_demol == 0 & (status == "demolished [under demolition]")
replace sel2019_demol = 1 if sel2019_demol == 0 & (status == "demolished [destroyed]" & destruction != . & destruction > 2019)
tab status sel2019_demol, m

gen sel2019 = (sel2019_exist == 1 | sel2019_demol == 1)
tab status sel2019, m

***** COMPARISON 2012 2019 *****

tab status sel2012, m
tab status sel2019, m
tab sel2012 sel2019, m
label var sel2012 "Sample of buildings existing in 2012"
label var sel2019 "Sample of buildings existing in 2019"

***** ADDING THE ONES WITH MISSING YEAR INFORMATION *****

* 2012
tab status sel2012 if (yearconstructionstart == . & yearconstructionend == . & yearlastrenovation == .), m
gen sel2012myi = sel2012
replace sel2012myi = 1 if (status == "existing [completed]" | status == "existing [in ruins]") & (yearconstructionstart == . & yearconstructionend == . & yearlastrenovation == .)
tab status sel2012 if (yearconstructionstart == . & yearconstructionend == . & yearlastrenovation == .), m
tab status sel2012myi if (yearconstructionstart == . & yearconstructionend == . & yearlastrenovation == .), m
tab sel2012 sel2012myi, m

* 2019
tab status sel2019 if (yearconstructionstart == . & yearconstructionend == . & yearlastrenovation == .), m
gen sel2019myi = sel2019
replace sel2019myi = 1 if (status == "existing [completed]" | status == "existing [in ruins]") & (yearconstructionstart == . & yearconstructionend == . & yearlastrenovation == .)
tab status sel2019 if (yearconstructionstart == . & yearconstructionend == . & yearlastrenovation == .), m
tab status sel2019myi if (yearconstructionstart == . & yearconstructionend == . & yearlastrenovation == .), m
tab sel2019 sel2019myi, m

* For 2012 and 2019, we always keep the missing year info ones 
drop sel2012 sel2019
ren sel2012myi sel2012
ren sel2019myi sel2019
tab sel2012 sel2019, m

***** VOLUME *****

count
*748,640
count if height != .
*748,640
count if length != .
*6138
count if width != .
*5655
count if volume != .
*1,12
count if grossfloorarea != .
*141,399 = 18.9% 
count if usablefloorarea != .
*11,929
count if grossfloorarea == . & usablefloorarea != .
* 7618 

corr grossfloorarea usablefloorarea
areg grossfloorarea usablefloorarea, absorb(countrycity) robust
gen coefufa = _b[usablefloorarea]
gen coefcons = _b[_cons]
count if grossfloorarea == . 
* 607,241
replace grossfloorarea = coefcons+coefufa*usablefloorarea if grossfloorarea == .
drop coefufa coefcons 
count if grossfloorarea == . 
* 599,623
count if grossfloorarea != . 
* 599,623

***** ANALYSIS ON THE EMPIRE STATE BUILDING *****

*corr height grossfloorarea

sum height if name == "Empire State Building"
* 381 m
* At its top floor, the Empire State Building stands 1,250 feet (380 meters) tall. Counting the spire and antenna, the building clocks in at a mighty 1,454 feet (443 meters).
sum floorsovergro if name == "Empire State Building"
* 102 floors
* height per floor = 381 / 102 = 3.735294118

sum grossfloorarea if name == "Empire State Building"
* 208,879 sq m
* https://en.wikipedia.org/wiki/Empire_State_Building
* other sources: Floor area	2,248,355 sq ft (208,879 m2) 

sum length width volume if name == "Empire State Building"
*Dimensions
*Other dimensions 424 ft (129.2 m) east–west; 187 ft (57.0 m) north–south
* length = 129 m
* width = 57 m
* area base = 7353 sq m
* volume = 1,048,159

* box volume recalculated
* 381 * 129.24 * 57 = 2,806,705
* volume = 1,048,159
gen base = length*width
sum base if name == "Empire State Building"
* 7367
* volume 37 million cubic feet = 1.04 million cubic meters
* https://images.adsttc.com/media/images/5576/ec62/e58e/ceaa/2a00/0115/medium_jpg/ESB-section_50350.jpg?1433857101

* area per floor
gen meanarea_sqm = grossfloorarea/floorsovergro
sum meanarea_sqm if name == "Empire State Building"
* 2,048 sq m per floor 
* less than 1st base of 7,367 or 2nd base of 4,329

* estimated volume
* gross floor area in sqkm * height in meters
* 2048 * 381 = 780,288
gen gfa_cubicm = meanarea_sqm*height
sum gfa_cubicm if name == "Empire State Building"
* 780,224.5

drop meanarea_sqm gfa_cubicm

count if height != .
* 566,566
count if numfloors != .
* 566,566
count if grossfloorarea != .
* 138,254 
count if grossfloorarea == . & usablefloorarea != .
* 7,194
corr grossfloorarea usablefloorarea
* 0.78
corr grossfloorarea usablefloorarea [w=grossfloorarea]
* 0.64
*twoway (scatter grossfloorarea usablefloorarea)
sum grossfloorarea usablefloorarea if grossfloorarea != . & usablefloorarea != .
* 4172
*twoway (scatter grossfloorarea usablefloorarea)

count if grossfloorarea != . & usablefloorarea != .
* 4172
count if grossfloorarea != . & usablefloorarea != . & grossfloorarea >= usablefloorarea
* 4003
count if grossfloorarea != . & usablefloorarea != . & grossfloorarea < usablefloorarea
* 169
*reg grossfloorarea usablefloorarea, robust
*areg grossfloorarea usablefloorarea, robust absorb(countrycity)

* Relationship GFA (should be in sq m) and height *
sum grossfloorarea, d
gsort- grossfloorarea
order cityname grossfloorarea height name
* Some issues with the top ones

* Verified the 20 top ones
* https://en.wikipedia.org/wiki/Chrysler_World_Headquarters_and_Technology_Center
replace grossfloorarea = 500000 if name == "Chrysler World Headquarters & Technology Center"
* https://www.skyscrapercity.com/threads/hangzhou-hangzhou-sports-park-stadium-80-000.1943781/
replace grossfloorarea = 400000 if name == "Hangzhou Sports Park Main Stadium"
* https://ko.wikipedia.org/wiki/GS%ED%83%80%EC%9B%8C
replace grossfloorarea = 141151 if name == "LG Gangnam Tower"
* https://en.wikipedia.org/wiki/Calgary_Municipal_Building
replace grossfloorarea = 74000 if name == "Calgary Municipal Building"

*twoway (scatter grossfloorarea height, mlabel(name))
*twoway (scatter grossfloorarea height if height >= 35, mlabel(name))
*twoway (scatter grossfloorarea height if height >= 55, mlabel(name))

corr grossfloorarea height
* 0.60
corr grossfloorarea height if height >= 35
* 0.56
corr grossfloorarea height if height >= 55
* 0.55
corr grossfloorarea height [w=height]
* 0.72
corr grossfloorarea height [w=height] if height >= 35
* 0.67
corr grossfloorarea height [w=height] if height >= 55
* 0.65

***** WE NOW CREATE THE VARIABLES WE NEED TO ESTIMATE THE VOLUME *****
* In particular, the GFA variable that is missing

drop coefheight 

* First, we need the volume of all with GFA  
gen usablevolume = grossfloorarea/numfloors*height
count if usablevolume != . 
*twoway (scatter usablevolume height, mlabel(name))
*twoway (scatter usablevolume height if height >= 15, mlabel(name))
*twoway (scatter usablevolume height if height >= 55, mlabel(name))
sum numfloors if height >= 55 ,d
*twoway (scatter  height numfloors if height >= 55, mlabel(name))
*twoway (scatter  height numfloors if height >= 55 & numfloors <= 50, mlabel(name))
drop usablevolume

gen heightperfloor = height/numfloors
sum heightperfloor if grossfloorarea != ., d

** USABLE VOLUME 35-75 2012 2019 **

* We drop the weird buildings with a lot of height per floor
replace grossfloorarea = . if heightperfloor > 6
* We calculate volume 
gen usablevolume = grossfloorarea/numfloors*height
count if height == .
count if numfloors == .
count if usablevolume != .

* WEB APPENDIX FIGURE B1
*twoway (scatter usablevolume height, mcolor(gs5) msize(tiny))(qfit usablevolume height, lcolor(gs8) lwidth(thick) lpattern(dash))(lfit usablevolume height if heightroofraw != . & floorsovergro != ., lcolor(black) lwidth(thick)), ytitle(Roof Height (meters), margin(medsmall)) xtitle(Number of Floors, margin(small)) xlabel(0(25)150) ylabel(0(100)700) legend(order(3 "Linear Fit" 2 "Quadratic Fit")) graphregion(fcolor(white) lcolor(white) ifcolor(white) ilcolor(white) margin(vsmall))
*graph export volume_height.png, replace width(2620) height(1908)

** First, identify outliers, from 0 m **
areg usablevolume height, absorb(countrycity) robust
gen coefheight = _b[height]
gen coefcons = _b[_cons]
sum coefcons coefheight
gen volumeuse = usablevolume
replace volumeuse = coefcons+coefheight*height if volumeuse == .
drop coefheight coefcons
sum volumeuse if name == "Empire State Building"
* ok 
count
count if volumeuse == .

* Identify outliers due to over-estimated GFA (see analysis below)
* we don't need city FE for that one
drop diff
reg volumeuse height, robust
predict diff, resid
sum diff if diff > 0, d
sum volumeuse height diff if name == "Chubu Centrair International Airpor"
drop volumeuse*

* We replace by 0 
replace grossfloorarea = . if diff >= 140418.4 & diff != .
* 2,543 real changes made
drop usablevolume
gen usablevolume = grossfloorarea/numfloors*height
* we recreate the usable volume

areg usablevolume height, absorb(countrycity) robust
gen coefheight = _b[height]
gen coefcons = _b[_cons]
sum coefcons coefheight
gen volumeuse = usablevolume
replace volumeuse = coefcons+coefheight*height if volumeuse == .
drop coefheight coefcons

* WEB APPENDIX FIGURE B1 
*twoway (scatter volumeuse height, mcolor(gs5) msize(tiny))(qfit volumeuse height, lcolor(gs8) lwidth(thick) lpattern(dash))(lfit volumeuse height, lcolor(black) lwidth(thick)), ytitle(Volume (cubic m), margin(medsmall)) xtitle(Roof Height (m), margin(small)) legend(order(3 "Linear Fit" 2 "Quadratic Fit")) graphregion(fcolor(white) lcolor(white) ifcolor(white) ilcolor(white) margin(vsmall))
*graph export volume_height.png, replace width(2620) height(1908)

drop volumeuse

** Now, we re-estimate the volume for each
foreach X of numlist 15 35(5)75 100 {
** Estimate GFA when missing **
areg usablevolume height if height >= `X', absorb(countrycity) robust
gen coefheight = _b[height]
gen coefcons = _b[_cons]
sum coefcons coefheight
gen volumeuse`X' = usablevolume if height >= `X'
replace volumeuse`X' = coefcons+coefheight*height if height >= `X' & volumeuse`X' == .
drop coefheight coefcons
sum volumeuse`X' if name == "Empire State Building"
* ok 780,225
gen volumeuse`X'_12 = volumeuse`X' if sel2012 == 1
gen volumeuse`X'_19 = volumeuse`X' if sel2019 == 1
}

***** ANALYSIS OF HEIGHT VARIABLES *****

* WEB APPX FIGURE B2 - WORLD
twoway (kdensity height, lcolor(black) lwidth(thick)), legend(off) xline(50, lcolor(gs5) lpattern(longdash) lwidth(medthick)) xtitle(Height (m), margin(medsmall)) ytitle(Density, margin(medsmall)) graphregion(fcolor(white) lcolor(white) ifcolor(white) ilcolor(white) margin(vsmall)) xlabel(0(50)700)
graph export kernel_height_world.png, replace width(2620) height(1908)

count if sel2012 == 1
count if height >= 55 & sel2012 == 1
count if height >= 55 & sel2019 == 1
* 191,237
* 258,542

* WEB APPX FIGURE - CONTINENT
twoway (kdensity height if height <= 70, lcolor(black) lwidth(thick)), by(continentname) legend(off) xline(50, lcolor(gs5) lpattern(longdash) lwidth(medthick)) xtitle(Height (m), margin(medsmall)) ytitle(Density, margin(medsmall)) graphregion(fcolor(white) lcolor(white) ifcolor(white) ilcolor(white) margin(vsmall)) xlabel(10(10)70)
graph export kernel_height_continents.png, replace width(2620) height(1908)

*codebook totpop2020
*gen height5 = round(height,5)
*tab height5
*keep height height5 countryname totpop2020
*save temp, replace

** TOTAL VOLUME 15-75 2012 2019 **

count if volume != .
* 1,112
*twoway (kdensity height if height >= 55 & volume != .)(kdensity height if height >= 55 & volume == .), legend(order(1 "Volume available" 2 "Missing"))
foreach X of numlist 55 {
gen volumetot`X' = volumeuse`X'
* We use the volume information if available
replace volumetot`X' = volume if volume != .
gen volumetot`X'_12 = volumetot`X' if sel2012 == 1
gen volumetot`X'_19 = volumetot`X' if sel2019 == 1
}

** DATA FOR THOMAS **

keep if sel2012 == 1 | sel2019 == 1
count
* 715,045

keep ebn countryname cityname countrycity latitude longitude height sel2012 sel2019 volumeuse*12* volumeuse*19* volumetot*12* volumetot*19* 
order ebn 
label var ebn "Building number in Emporis"
label var countryname "Country name in Emporis"
label var cityname "City name in Emporis"
label var countrycity "Country-city name"
label var height "Roof height (meters)"
label var sel2012 "Sample of buildings existing in 2012 (1=Yes;0=No)"
label var sel2019 "Sample of buildings existing in 2019 (1=Yes;0=No)"
* We label the volume variables 
foreach X of numlist 35(5)75 {
label var volumeuse`X'_12 "Usable volume c. 2012 (cubic m; threshold: `X'm+)"
label var volumeuse`X'_19 "Usable volume c. 2019 (cubic m; threshold: `X'm+)"
}
foreach X of numlist 55 {
label var volumetot`X'_12 "Total/usable volume c. 2012 (cubic m; threshold: `X'm+)"
label var volumetot`X'_19 "Total/usable volume c. 2019 (cubic m; threshold: `X'm+)"
}
order ebn latitude longitude countryname cityname countrycity height sel* volumeuse*_12 volumeuse*_19 volumetot*_12 volumetot*_19
tab sel2012 sel2019, m
* OK
sum volumeuse55_12 if sel2012 == 1, d
sum volumeuse55_12 if sel2012 == 0, d
* OK
sum volumeuse55_19 if sel2019 == 1, d
sum volumeuse55_19 if sel2019 == 0, d
*codebook ebn latitude longitude countryname cityname countrycity height numfloors sel*
drop *15_12 *100_12 *15_19 *100_19
foreach X of numlist 35(5)75 {
gen height`X'_12 = height if height >= `X' & sel2012 == 1
gen height`X'_19 = height if height >= `X' & sel2019 == 1
}
foreach X of numlist 35(5)75 {
label var height`X'_12 "Roof height c. 2012 (m; threshold: `X'm+)"
label var height`X'_19 "Roof height c. 2019 (m; threshold: `X'm+)"
}
order ebn latitude longitude countryname cityname countrycity sel* height height*_12 height*_19 volumeuse*_12 volumeuse*_19 volumetot*_12 volumetot*_19
save data_for_thomas, replace
count
* 715,045
* We export to Excel
use data_for_thomas, clear
desc, f
export delimited using "data_for_thomas_09222022.csv", replace
